
  ___  ____  ____  ____  ____ ®
 /__    /   ____/   /   ____/      18.0
___/   /   /___/   /   /___/       MP—Parallel Edition

 Statistics and Data Science       Copyright 1985-2023 StataCorp LLC
                                   StataCorp
                                   4905 Lakeway Drive
                                   College Station, Texas 77845 USA
                                   800-STATA-PC        https://www.stata.com
                                   979-696-4600        stata@stata.com

Stata license: 22-user 8-core network, expiring 30 Jun 2025
Serial number: 501809309427
  Licensed to: The University of Chicago Booth School of Business
               Mercury Computing Cluster

Notes:
      1. Stata is running in batch mode.
      2. Unicode is supported; see help unicode_advice.
      3. More than 2 billion observations are allowed; see help obs_advice.
      4. Maximum number of variables is set to 5,000 but can be increased;
          see help set_maxvar.

. do "/project/fagoolsb/git/service_industries/replication_package/builds/4_hom
> ebase_build/4_1_homebase_build.do" 

. * Author: Joe Tatarka
. * Name: 4_1_homebase_build.do
. * Purpose: Take raw homebase data from 2018-2022 and combine into a single cl
> eaned homebase dataset
. 
. * Set Global File Paths
. global root = "/project/fagoolsb/service_industries/replication_package"

. global raw_root = "${root}/datasets/raw"

. global intermediate_root = "${root}/datasets/intermediate"

. global built_root = "${root}/datasets/built"

. global exhibits_root = "${root}/exhibits"

. 
. global homebase_root = "/project/fagoolsb/service_industries/data/datasets/ra
> w/homebase"

. 
. *****************************************************************************
> ****
. ***** 1. Bring in and append Raw data
. *****************************************************************************
> ***
. 
. *** Load in Jan 2018 - June 2021 Data and clean the data
. 
. local files : dir "${homebase_root}/hours_worked_dataset_historical_files" fi
> les "*"

. local i = 1

. foreach file in `files' {
  2. import delimited "${homebase_root}/hours_worked_dataset_historical_files/`
> file'", clear
  3. 
. drop msa industry business_type 
  4. 
. replace location_created_date =substr(location_created_date,1,10)
  5. replace job_created_date = substr(job_created_date,1,10)
  6. replace job_archived_date = "" if job_archived_date == "\N"
  7. replace job_archived_date = substr(job_archived_date,1,10)
  8. 
. gen emp_level = 1 if level == "Employee"
  9. replace emp_level = 2 if level == "Manager"
 10. replace emp_level = 3 if level == "General Manager"
 11. 
. label define emp_level 1 "Employee" 2 "Manager" 3 "General Manager", replace 
 12. 
. label values emp_level emp_level
 13. drop level 
 14. 
. foreach var in event_date job_archived_date job_created_date location_created
> _date{
 15. gen `var'_1 = date(`var', "YMD")
 16. drop `var' 
 17. rename `var'_1 `var'
 18. format `var' %td
 19. }
 20. 
. destring avg_hourly_wage_rate total_wages_earned naics_code, replace force 
 21. 
. tostring county_code, replace
 22. replace county_code = "" if county_code == "."
 23. replace county_code = "0" + county_code if strlen(county_code) < 5 & count
> y_code != ""
 24. 
. gen file_date = .
 25. 
. compress
 26. 
. order event_date, first 
 27. 
. tempfile homebase_`i'
 28. save `homebase_`i'', replace 
 29. 
. local i = `i' + 1
 30. }
(encoding automatically selected: ISO-8859-1)
(19 vars, 18,273,206 obs)
(18,273,206 real changes made)
(18,273,206 real changes made)
(0 real changes made)
(2,470,561 real changes made)
(1,539,914 missing values generated)
(1,340,695 real changes made)
(199,219 real changes made)
(15,802,645 missing values generated)
avg_hourly_wage_rate already numeric; no replace
total_wages_earned already numeric; no replace
naics_code already numeric; no replace
county_code was long now str5
(859,384 real changes made)
(4,236,828 real changes made)
(18,273,206 missing values generated)
  variable emp_level was float now byte
  variable event_date was float now int
  variable job_archived_date was float now int
  variable job_created_date was float now int
  variable location_created_date was float now int
  variable file_date was float now byte
  (255,824,884 bytes saved)
(file /scratch/jtatarka/14736945/St700248.000001 not found)
file /scratch/jtatarka/14736945/St700248.000001 saved as .dta format
(encoding automatically selected: ISO-8859-1)
(19 vars, 28,877,844 obs)
(28,877,844 real changes made)
(28,877,844 real changes made)
(0 real changes made)
(10,230,633 real changes made)
(2,760,737 missing values generated)
(2,425,996 real changes made)
(334,741 real changes made)
(18,647,211 missing values generated)
avg_hourly_wage_rate already numeric; no replace
total_wages_earned already numeric; no replace
naics_code already numeric; no replace
county_code was long now str5
(1,238,163 real changes made)
(6,624,264 real changes made)
(28,877,844 missing values generated)
  variable emp_level was float now byte
  variable event_date was float now int
  variable job_created_date was float now int
  variable location_created_date was float now int
  variable file_date was float now byte
  (346,534,128 bytes saved)
(file /scratch/jtatarka/14736945/St700248.000002 not found)
file /scratch/jtatarka/14736945/St700248.000002 saved as .dta format
(encoding automatically selected: ISO-8859-1)
(19 vars, 28,331,174 obs)
(28,331,174 real changes made)
(28,331,174 real changes made)
(0 real changes made)
(14,833,266 real changes made)
(2,866,664 missing values generated)
(2,612,785 real changes made)
(253,879 real changes made)
(13,497,908 missing values generated)
avg_hourly_wage_rate already numeric; no replace
total_wages_earned already numeric; no replace
naics_code already numeric; no replace
county_code was long now str5
(1,036,126 real changes made)
(6,780,404 real changes made)
(28,331,174 missing values generated)
  variable emp_level was float now byte
  variable event_date was float now int
  variable job_created_date was float now int
  variable location_created_date was float now int
  variable file_date was float now byte
  (339,974,088 bytes saved)
(file /scratch/jtatarka/14736945/St700248.000003 not found)
file /scratch/jtatarka/14736945/St700248.000003 saved as .dta format
(encoding automatically selected: ISO-8859-1)
(19 vars, 16,869,472 obs)
(16,869,472 real changes made)
(16,869,472 real changes made)
(0 real changes made)
(5,974,099 real changes made)
(1,613,077 missing values generated)
(1,417,953 real changes made)
(195,124 real changes made)
(10,895,373 missing values generated)
avg_hourly_wage_rate already numeric; no replace
total_wages_earned already numeric; no replace
naics_code already numeric; no replace
county_code was long now str5
(723,346 real changes made)
(3,871,486 real changes made)
(16,869,472 missing values generated)
  variable emp_level was float now byte
  variable event_date was float now int
  variable job_created_date was float now int
  variable location_created_date was float now int
  variable file_date was float now byte
  (202,433,664 bytes saved)
(file /scratch/jtatarka/14736945/St700248.000004 not found)
file /scratch/jtatarka/14736945/St700248.000004 saved as .dta format
(encoding automatically selected: ISO-8859-1)
(19 vars, 29,500,312 obs)
(29,500,312 real changes made)
(29,500,312 real changes made)
(0 real changes made)
(3,986,106 real changes made)
(2,481,699 missing values generated)
(2,160,614 real changes made)
(321,085 real changes made)
(25,514,206 missing values generated)
avg_hourly_wage_rate already numeric; no replace
total_wages_earned already numeric; no replace
naics_code already numeric; no replace
county_code was long now str5
(1,389,117 real changes made)
(6,841,199 real changes made)
(29,500,312 missing values generated)
  variable emp_level was float now byte
  variable event_date was float now int
  variable job_archived_date was float now int
  variable job_created_date was float now int
  variable location_created_date was float now int
  variable file_date was float now byte
  (413,004,368 bytes saved)
(file /scratch/jtatarka/14736945/St700248.000005 not found)
file /scratch/jtatarka/14736945/St700248.000005 saved as .dta format
(encoding automatically selected: ISO-8859-1)
(19 vars, 22,566,360 obs)
(22,566,360 real changes made)
(22,566,360 real changes made)
(0 real changes made)
(13,763,962 real changes made)
(2,761,924 missing values generated)
(2,576,254 real changes made)
(185,670 real changes made)
(8,802,398 missing values generated)
avg_hourly_wage_rate already numeric; no replace
total_wages_earned already numeric; no replace
naics_code already numeric; no replace
county_code was long now str5
(601,049 real changes made)
(5,605,211 real changes made)
(22,566,360 missing values generated)
  variable emp_level was float now byte
  variable event_date was float now int
  variable job_archived_date was float now int
  variable job_created_date was float now int
  variable location_created_date was float now int
  variable file_date was float now byte
  (315,929,040 bytes saved)
(file /scratch/jtatarka/14736945/St700248.000006 not found)
file /scratch/jtatarka/14736945/St700248.000006 saved as .dta format
(encoding automatically selected: ISO-8859-1)
(19 vars, 27,983,602 obs)
(27,983,602 real changes made)
(27,983,602 real changes made)
(0 real changes made)
(17,074,738 real changes made)
(3,425,218 missing values generated)
(3,195,746 real changes made)
(229,472 real changes made)
(10,908,864 missing values generated)
avg_hourly_wage_rate already numeric; no replace
total_wages_earned already numeric; no replace
naics_code already numeric; no replace
county_code was long now str5
(744,640 real changes made)
(6,950,788 real changes made)
(27,983,602 missing values generated)
  variable emp_level was float now byte
  variable event_date was float now int
  variable job_archived_date was float now int
  variable job_created_date was float now int
  variable location_created_date was float now int
  variable file_date was float now byte
  (391,770,428 bytes saved)
(file /scratch/jtatarka/14736945/St700248.000007 not found)
file /scratch/jtatarka/14736945/St700248.000007 saved as .dta format
(encoding automatically selected: ISO-8859-1)
(19 vars, 28,331,452 obs)
(28,331,452 real changes made)
(28,331,452 real changes made)
(0 real changes made)
(14,833,980 real changes made)
(2,865,967 missing values generated)
(2,611,183 real changes made)
(254,784 real changes made)
(13,497,472 missing values generated)
avg_hourly_wage_rate already numeric; no replace
total_wages_earned already numeric; no replace
naics_code already numeric; no replace
county_code was long now str5
(1,036,733 real changes made)
(6,780,694 real changes made)
(28,331,452 missing values generated)
  variable emp_level was float now byte
  variable event_date was float now int
  variable job_created_date was float now int
  variable location_created_date was float now int
  variable file_date was float now byte
  (339,977,424 bytes saved)
(file /scratch/jtatarka/14736945/St700248.000008 not found)
file /scratch/jtatarka/14736945/St700248.000008 saved as .dta format
(encoding automatically selected: ISO-8859-1)
(19 vars, 18,320,692 obs)
(18,320,692 real changes made)
(18,320,692 real changes made)
(0 real changes made)
(9,592,086 real changes made)
(1,855,237 missing values generated)
(1,690,673 real changes made)
(164,564 real changes made)
(8,728,606 missing values generated)
avg_hourly_wage_rate already numeric; no replace
total_wages_earned already numeric; no replace
naics_code already numeric; no replace
county_code was long now str5
(670,087 real changes made)
(4,383,005 real changes made)
(18,320,692 missing values generated)
  variable emp_level was float now byte
  variable event_date was float now int
  variable job_created_date was float now int
  variable location_created_date was float now int
  variable file_date was float now byte
  (219,848,304 bytes saved)
(file /scratch/jtatarka/14736945/St700248.000009 not found)
file /scratch/jtatarka/14736945/St700248.000009 saved as .dta format
(encoding automatically selected: ISO-8859-1)
(19 vars, 28,878,716 obs)
(28,878,716 real changes made)
(28,878,716 real changes made)
(0 real changes made)
(10,228,717 real changes made)
(2,763,651 missing values generated)
(2,429,578 real changes made)
(334,073 real changes made)
(18,649,999 missing values generated)
avg_hourly_wage_rate already numeric; no replace
total_wages_earned already numeric; no replace
naics_code already numeric; no replace
county_code was long now str5
(1,239,575 real changes made)
(6,625,623 real changes made)
(28,878,716 missing values generated)
  variable emp_level was float now byte
  variable event_date was float now int
  variable job_created_date was float now int
  variable location_created_date was float now int
  variable file_date was float now byte
  (346,544,592 bytes saved)
(file /scratch/jtatarka/14736945/St700248.00000a not found)
file /scratch/jtatarka/14736945/St700248.00000a saved as .dta format

. 
. use `homebase_1', clear 

. erase `homebase_1'

. foreach j of numlist 2/10 {
  2.         append using `homebase_`j''
  3.         erase `homebase_`j''
  4. }
(variable zip was str17, now str41 to accommodate using data's values)
(variable job_archived_date was int, now float to accommodate using data's
       values)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(variable zip was str41, now str47 to accommodate using data's values)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)

. 
. 
. **** Now append data for months from July 2021 - Jan 2023 
. foreach month in "2021-07" "2021-08" "2021-09" "2021-10" "2021-11" "2021-12" 
> "2022-01" "2022-02" "2022-03" "2022-04" "2022-05" "2022-06" "2022-07" "2022-0
> 8" "2022-09" "2022-10" "2022-11" "2022-12" "2023-01"{
  2. 
. append using "${homebase_root}/hours_worked_only_daily_raw_data_files/month_d
> ta/`month'.dta"
  3. }
(variable total_wages_earned was float, now double to accommodate using
       data's values)
(variable avg_hourly_wage_rate was float, now double to accommodate using
       data's values)
(variable file_date was byte, now int to accommodate using data's values)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)
(label emp_level already defined)

. 
. **********************************************************************
. **** 2. Clean the Appended data
. **********************************************************************
. 
. *** drop duplicates 
. bys event_date user_id location_id: egen max_file_date = max(file_date)
(245,101,367 missing values generated)

. duplicates tag  event_date user_id location_id, gen(dup)

Duplicates in terms of event_date user_id location_id

. replace dup = 0 if file_date == max_file_date
(52,144,036 real changes made)

. 
. drop if dup > 0
(54,539,351 observations deleted)

. drop dup max_file_date

. 
. bys user_id event_date location_id: egen max_archive_date = max(job_archived_
> date)
(339,834,512 missing values generated)

. duplicates tag  event_date user_id location_id, gen(dup)

Duplicates in terms of event_date user_id location_id

. replace dup = 0 if job_archived_date == max_archive_date
(158 real changes made)

. drop if dup > 0
(86 observations deleted)

. drop dup max_archive_date

. 
. bys user_id event_date location_id: egen max_wage = max(total_wages_earned)
(170,148,793 missing values generated)

. duplicates tag  event_date user_id location_id, gen(dup)

Duplicates in terms of event_date user_id location_id

. replace dup = 0 if total_wages_earned == max_wage
(60 real changes made)

. drop if dup > 0
(12 observations deleted)

. drop dup max_wage

. 
. ** Keep from 2018-2022
. keep if event_date < date("2023-1-1", "YMD")
(10,534,493 observations deleted)

. 
. *** Only Keep Limited Service Restaurants
. keep if inlist(naics_code, 722513, 722514, 722515) 
(364,031,826 observations deleted)

. 
. *** Drop if not in the USA or if state variable is missing
. drop if state == "Not USA"
(1,452,305 observations deleted)

. replace state = "" if state == "Unclassified"
(918,301 real changes made)

. 
. * drop if county_code is missing or if establishment is in puerto rico
. drop if county_code == ""
(664,345 observations deleted)

. drop if state == "pr"
(1,243,990 observations deleted)

. 
. * drop if no hours worked or missing hours worked
. drop if hours_worked == 0 | hours_worked == .
(146,266 observations deleted)

. 
. *** Create some key variables 
. gen year_month_gs = ym(year(event_date), month(event_date))

. format year_month_gs %tm

. 
. gen shift_less_4 = (hours_worked <4)

. 
. gen less_4_hours = hours_worked if hours_worked <4
(59,843,156 missing values generated)

. 
. gen agg_hours = hours_worked

. 
. * Collapse to Monthly Level 
. collapse (mean) hours_worked shift_less_4 (sum) agg_hours less_4_hours, by(ye
> ar_month_gs)

. 
. ************************************************************
. *** 3. Save Intermediate Homebase Dataset 
. ************************************************************
. save "${built_root}/homebase_build.dta", replace
(file
    /project/fagoolsb/service_industries/replication_package/datasets/built/h
    > omebase_build.dta not found)
file
    /project/fagoolsb/service_industries/replication_package/datasets/built/h
    > omebase_build.dta saved

. 
. 
end of do-file


. 